Load libraries, connect Spark, load data

Turn columns into the right format

# Cast the raw character columns to numeric, drop incomplete rows, and
# give several columns shorter, more convenient names.
d1 <- d1 %>%
  mutate(
    host_id = as.numeric(host_id),
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude),
    price = as.numeric(price),
    minimum_nights = as.numeric(minimum_nights),
    number_of_reviews = as.numeric(number_of_reviews),
    reviews_per_month = as.numeric(reviews_per_month),
    calculated_host_listings_count = as.numeric(calculated_host_listings_count),
    availability_365 = as.numeric(availability_365)
  ) %>%
  na.omit() %>%
  rename(
    desc = name,
    ID = host_id,
    listing_host = calculated_host_listings_count,
    availability = availability_365
  )
## * Dropped 7 rows with 'na.omit' (46941 => 46934)

# Row count of the cleaned Spark data frame.
# NOTE(review): the name `length` shadows base::length; kept as-is in
# case later (unseen) chunks refer to it.
length <- sdf_nrow(d1)

Single variable exploration - Hunting outliers

Distribution

We’ll start by plotting distribution of all numerical variables.

#histogram
plot_histogram = function(col, data = d1) {
  plot_data = data %>% sdf_read_column(col) %>% as.numeric() %>% as.data.frame()
  colnames(plot_data) = col
  
  plot = ggplot(plot_data, aes_string(x = col))+
  geom_histogram(bins = 30, fill="#1F77B4")+
  theme_few()
  labs(title = col,
       x = "",
       y = "")+
  theme(plot.title = element_text(hjust = 0.5, size=9),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())
  
  return(plot)
}

# Draw a vertical boxplot of one numeric Spark column.
#
#   col  - column name (string) to plot
#   data - Spark data frame the column is read from (defaults to d1)
#
# Returns a ggplot object.
plot_box = function(col, data = d1) {
  # Materialize the single column locally under its original name.
  values = data %>% sdf_read_column(col) %>% as.numeric() %>% as.data.frame()
  colnames(values) = col
  
  box = ggplot(values, aes_string(y = col)) +
    geom_boxplot() +
    theme_few() +
    labs(title = col, x = "", y = "") +
    theme(plot.title = element_text(hjust = 0.5, size = 9))
  
  return(box)
}

Loop over numeric columns and plot histograms

# Names of the numeric columns in d1, selected by position.
numeric_columns = colnames(d1)[c(5,6,8,9,10,12:14)]

# One histogram per numeric column, keyed by column name.
# (Iterate over the names directly instead of the unsafe 1:length(x)
# idiom, which misbehaves when the vector is empty.)
histograms = list()
for (col in numeric_columns) {
  histograms[[col]] = plot_histogram(col)
}

#arrange histograms in 2x4 grid
do.call("grid.arrange", c(histograms, ncol = 2))

#save the grid as png
hist_grid = do.call("arrangeGrob", c(histograms, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/histogram_grid.png"), hist_grid, width = 7, height = 7)

Boxplots

# One boxplot per numeric column, keyed by column name.
# (Iterate over the names directly instead of 1:length(x).)
boxplots = list()
for (col in numeric_columns) {
  boxplots[[col]] = plot_box(col)
}

#arrange boxplots (not histograms — fixed comment) in 2x4 grid
do.call("grid.arrange", c(boxplots, ncol = 2))

#save the grid as png
box_grid = do.call("arrangeGrob", c(boxplots, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/boxplot_grid.png"), box_grid)
## Saving 7 x 5 in image
## Saving 7 x 5 in image

Availability

We’ll start by looking for nonsense values (e.g. availability more than 365) and explore whether it needs transformations, excluding outliers etc.

# Summary statistics for availability (count / mean / sd / min / max).
d1 %>%
  sdf_describe("availability")
## # Source: spark<?> [?? x 2]
##   summary availability      
##   <chr>   <chr>             
## 1 count   46934             
## 2 mean    110.44541270720586
## 3 stddev  130.9620570919105 
## 4 min     0.0               
## 5 max     365.0
#look at its histogram in detail
histograms$availability

boxplots$availability

#there is at least one listing that is never available
# (fixed typo in the printed message: "accomodations" -> "accommodations")
cat("How many accommodations are never available?",
    d1 %>% filter(availability==0) %>% sdf_nrow())
## How many accommodations are never available? 17209
#on the other hand, how many are always available
cat("\nHow many accommodations are always available?",
    d1 %>% filter(availability==365) %>% sdf_nrow())
## 
## How many accommodations are always available? 1199

It seems most listings are available only for very narrow time windows. However, there is quite high variation, and so the mean and standard deviation are shifted away from the mode. Although a log-transform might help achieve a normal distribution, we’d lose data as there are many 0 values (log(0) = -infinity).

Minimum nights

# Summary statistics for minimum_nights.
d1 %>%
  sdf_describe("minimum_nights")
## # Source: spark<?> [?? x 2]
##   summary minimum_nights    
##   <chr>   <chr>             
## 1 count   46934             
## 2 mean    7.115609153279073 
## 3 stddev  20.634444444815273
## 4 min     1.0               
## 5 max     1250.0
#look at histogram and boxplot
histograms$minimum_nights

boxplots$minimum_nights

# Listings that demand a stay of a year or more (likely data-entry noise).
cat("How many accomodations have minimum nights a year or more?",
    d1 %>% filter(minimum_nights >= 365) %>% sdf_nrow())
## How many accomodations have minimum nights a year or more? 41
# Outlier count: minimum_nights beyond mean + 2.5 sd.
# NOTE(review): mean()/sd() inside filter() on a Spark tbl are translated
# to SQL by sparklyr; that translation is apparently what emits the rlang
# deprecation warnings echoed below — the count is still produced.
cat("How many accomodations have minimum nights more than mean + 2.5*standard deviation?",
d1 %>% filter(minimum_nights >= mean(minimum_nights, na.rm = T)+2.5*sd(minimum_nights, na.rm = T)) %>% sdf_nrow())
## Warning: `lang_name()` is deprecated as of rlang 0.2.0.
## Please use `call_name()` instead.
## This warning is displayed once per session.
## Warning: `lang()` is deprecated as of rlang 0.2.0.
## Please use `call2()` instead.
## This warning is displayed once per session.
## How many accomodations have minimum nights more than mean + 2.5*standard deviation? 427

There are 427 datapoints that are more than 2.5 SDs away from mean, it’s safe to say that these are outliers, we’ll remove them later for modelling purposes.

#d1 = d1 %>%
#  filter(minimum_nights <= mean(minimum_nights, na.rm = T)+2.5*sd(minimum_nights, na.rm = T))

#replot histogram
#plot_histogram("minimum_nights")
#plot_box("minimum_nights")

There are still some possible outliers but the spread is significantly lower. Regression models should be able to handle extrapolating these more extreme values.

Number of reviews

# Summary statistics for number_of_reviews.
d1 %>% sdf_describe("number_of_reviews")
## # Source: spark<?> [?? x 2]
##   summary number_of_reviews
##   <chr>   <chr>            
## 1 count   46934            
## 2 mean    23.18696467379725
## 3 stddev  44.71422440863874
## 4 min     0.0              
## 5 max     629.0
#look at plots
histograms$number_of_reviews

boxplots$number_of_reviews

We can see some extreme values again.

# Count of listings more than 2.5 sd above the mean review count.
d1 %>%
  filter(number_of_reviews >= mean(number_of_reviews, na.rm = T)+2.5*sd(number_of_reviews, na.rm = T)) %>%
  sdf_nrow()
## [1] 1688
# Listings that have never been reviewed.
d1 %>%
  filter(number_of_reviews == 0) %>%
  sdf_nrow()
## [1] 9678

Reviews per month

# Summary statistics for reviews_per_month.
d1 %>% sdf_describe("reviews_per_month")
## # Source: spark<?> [?? x 2]
##   summary reviews_per_month
##   <chr>   <chr>            
## 1 count   46934            
## 2 mean    1.073917629010965
## 3 stddev  1.585515640503751
## 4 min     0.0              
## 5 max     58.5
# Distribution plots built earlier in the histogram/boxplot grids.
histograms$reviews_per_month

boxplots$reviews_per_month

Most listings receive approx 1 review/month with a few extremes.

Listings per host + host_ID

# Summary statistics for listing_host (listings per host), computed over
# all rows — multi-listing hosts are counted once per listing here.
d1 %>% sdf_describe("listing_host")
## # Source: spark<?> [?? x 2]
##   summary listing_host      
##   <chr>   <chr>             
## 1 count   46934             
## 2 mean    7.341202539736652 
## 3 stddev  33.595713063100774
## 4 min     1.0               
## 5 max     327.0
histograms$listing_host

This feature is a bit misleading since there are duplicates (listings from the same host). Let’s recompute the statistics by looking at unique values.

# One row per host: distinct (ID, listing_host) pairs, busiest hosts first.
# NOTE(review): desc here resolves to dplyr::desc(), not the column that
# was renamed to `desc` (the listing name) earlier — confirm this is
# intentional wherever `desc` appears unquoted.
listings = d1 %>% select(ID, listing_host) %>% distinct() %>% arrange(desc(listing_host))

listings %>% sdf_describe("listing_host")
## # Source: spark<?> [?? x 2]
##   summary listing_host      
##   <chr>   <chr>             
## 1 count   36081             
## 2 mean    1.3045092985227682
## 3 stddev  2.8070101327169055
## 4 min     1.0               
## 5 max     327.0
#replot histogram and boxplot (outer parentheses also print the plots)
(hist_listings = plot_histogram("listing_host", listings))

(box_listings = plot_box("listing_host", listings))

#update grids of plots with the deduplicated versions
histograms[["listing_host"]] = hist_listings
boxplots[["listing_host"]] = box_listings

#save to png again, overwriting the earlier grids
box_grid = do.call("arrangeGrob", c(boxplots, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/boxplot_grid.png"), box_grid)
## Saving 7 x 5 in image
hist_grid = do.call("arrangeGrob", c(histograms, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/histogram_grid.png"), hist_grid, width = 7, height = 7)

Most hosts have only 1 listing on average. However, there are clearly some who have sort of a network of listings. Let’s look at the top 10.

#collect listings to R
listing_data  = sdf_collect(listings)
listing_data$ID = as.factor(listing_data$ID)

# Top 10 hosts by number of listings.
# geom_col() replaces geom_bar(stat = "identity"); FALSE is spelled out
# instead of the reassignable shorthand F.
(top_hosts = ggplot(listing_data[1:10,], aes(x = reorder(ID, -listing_host), y = listing_host, fill = ID))+
  geom_col()+
  theme_few()+
  scale_fill_tableau()+
  guides(fill = FALSE)+
  labs(x = "",
       y = "Listings")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))

ggsave(paste0(VISUALIZATIONS, "/top_hosts_listings.png"), top_hosts)
## Saving 7 x 5 in image
# IDs of hosts with 10 or more listings, kept for the maps below.
topID = listing_data %>% filter(listing_host>=10) %>% select(ID)
topID = as.numeric(as.character(topID$ID))

Price

# Summary statistics for price.
d1 %>% sdf_describe("price")
## # Source: spark<?> [?? x 2]
##   summary price             
##   <chr>   <chr>             
## 1 count   46934             
## 2 mean    154.30706950185368
## 3 stddev  240.57117624168677
## 4 min     0.0               
## 5 max     10000.0
histograms$price

boxplots$price

There are some ridiculously expensive listings. On the other hand, there’s at least one listing which is offered for free. To explore the distribution of more common prices, we’ll remove these extremes for now.

# How many listings have a zero price?
cat("There are", d1 %>% filter(price == 0) %>% sdf_nrow(), "listings offered for free.")
## There are 10 listings offered for free.
#Let's look at their descriptions
d1 %>% filter(price == 0) %>% select(desc) #these mostly look like invalid data
## # Source: spark<?> [?? x 1]
##    desc                                              
##    <chr>                                             
##  1 Huge Brooklyn Brownstone Living, Close to it all. 
##  2 MARTIAL LOFT 3: REDEMPTION (upstairs, 2nd room)   
##  3 Sunny, Quiet Room in Greenpoint                   
##  4 Modern apartment in the heart of Williamsburg     
##  5 Spacious comfortable master bedroom with nice view
##  6 Contemporary bedroom in brownstone with nice view 
##  7 Cozy yet spacious private brownstone bedroom      
##  8 the best you can find                             
##  9 Coliving in Brooklyn! Modern design / Shared room 
## 10 Best Coliving space ever! Shared room.
#remove those free places immediately (they look like invalid data)
d1 = d1 %>% filter(price > 0)

#Let's look at descriptions of the most expensive listings
d1 %>% filter(price > 5000) %>% select(price,desc) %>% arrange(desc(price))
## # Source:     spark<?> [?? x 2]
## # Ordered by: desc(price)
##    price desc                                              
##    <dbl> <chr>                                             
##  1 10000 Luxury 1 bedroom apt. -stunning Manhattan views   
##  2 10000 Furnished room in Astoria apartment               
##  3 10000 1-BR Lincoln Center                               
##  4  9999 2br - The Heart of NYC: Manhattans Lower East Side
##  5  9999 Spanish Harlem Apt                                
##  6  9999 Quiet, Clean, Lit @ LES & Chinatown               
##  7  8500 Beautiful/Spacious 1 bed luxury flat-TriBeCa/Soho 
##  8  8000 Film Location                                     
##  9  7703 East 72nd Townhouse by (Hidden by Airbnb)         
## 10  7500 Gem of east Flatbush                              
## # … with more rows
prices = d1 %>% filter(price < 600) 

plot_histogram("price", prices)

Categorical variables

Summarise the categorical columns - count values per category

# Listing counts per borough (neighbourhood_group), largest first.
# NOTE(review): "neigbourhood" is a typo in the variable names, kept on
# purpose — the plotting code below refers to them by these names.
(neigbourhood_gr_count = d1 %>%
   group_by(neighbourhood_group) %>%
   summarise(count = n()) %>%
   arrange(desc(count)) %>%
   rename(borough = neighbourhood_group) %>%
   sdf_collect() %>%
   as.data.frame())
##     borough count
## 1 Manhattan 21543
## 2  Brooklyn 19843
## 3    Queens  5190
## 4     Bronx   348
# Listing counts per neighbourhood, largest first.
neigbourhood_count = d1 %>%
  group_by(neighbourhood) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  sdf_collect() %>%
  as.data.frame()
neigbourhood_count[1:10,]
##         neighbourhood count
## 1        Williamsburg  3912
## 2  Bedford-Stuyvesant  3699
## 3              Harlem  2642
## 4            Bushwick  2457
## 5     Upper West Side  1968
## 6      Hell's Kitchen  1951
## 7        East Village  1849
## 8     Upper East Side  1791
## 9       Crown Heights  1559
## 10            Midtown  1541
# Listing counts per room type.
(room_count = d1 %>%
  group_by(room_type) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  sdf_collect() %>%
  as.data.frame())
##         room_type count
## 1 Entire home/apt 24572
## 2    Private room 21278
## 3     Shared room  1074
## 3     Shared room  1074

Plot the counts

# Bar charts of listing counts per borough, top-10 neighbourhoods and
# room type. geom_col() replaces geom_bar(stat = "identity") and FALSE
# replaces the reassignable shorthand F throughout.
categ1 = ggplot(neigbourhood_gr_count, aes(x = reorder(borough, -count), y = count, fill = borough))+
  geom_col()+
  theme_few()+
  scale_fill_tableau()+
  labs(x = "")+
  guides(fill = FALSE)

categ2 = ggplot(neigbourhood_count[1:10,], aes(x = reorder(neighbourhood, -count), y = count, fill = neighbourhood))+
  geom_col()+
  theme_few()+
  scale_fill_tableau()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  labs(x = "")+
  guides(fill = FALSE)

categ3 = ggplot(room_count, aes(x = reorder(room_type, -count), y = count, fill = room_type))+
  geom_col()+
  theme_few()+
  scale_fill_tableau()+
  labs(x = "")+
  guides(fill = FALSE)

#arrange plots in nice layout (patchwork syntax: + side by side, / stacked)
(categorical_counts = (categ1 + categ3)/categ2)

ggsave(paste0(VISUALIZATIONS, "/categorical_counts.png"), categorical_counts)
## Saving 7 x 5 in image
## Saving 7 x 5 in image

Price vs. rest

Because we’re mainly interested in prices, we’ll now explore the relationship between price and the other variables.

Price vs. Borough

# Mean price and spread per borough, collected locally for plotting.
price_bor = d1 %>%
   group_by(neighbourhood_group) %>%
   summarise(mean = mean(price),
             sd = sd(price)) %>%
  sdf_collect()
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
# Bar chart of mean price per borough with +/-0.5 sd error bars.
# FALSE replaces the reassignable shorthand F in guides().
(price_bor_bar = ggplot(price_bor, aes(x = reorder(neighbourhood_group, -mean), y = mean, fill = neighbourhood_group))+
  geom_col(color = "black", size = 0.3)+
  geom_errorbar(aes(ymin = mean-0.5*sd, ymax = mean+0.5*sd))+
  theme_few()+
  scale_fill_tableau()+
  guides(fill = FALSE)+
  labs(x = "",
       y = "price")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1)))

ggsave(paste0(VISUALIZATIONS, "/price_borough.png"), price_bor_bar)
## Saving 7 x 5 in image

Price vs. Neighbourhood

# Mean price and spread per neighbourhood, most expensive first.
price_nei = d1 %>%
  group_by(neighbourhood) %>%
  summarise(mean = mean(price),
            sd = sd(price)) %>%
  arrange(desc(mean)) %>%
  sdf_collect()

# BUG FIX: in the original, the assignment's closing parenthesis came
# BEFORE "+ theme(axis.text.x = ...)", so the axis-text styling was
# applied only to the transiently printed copy and never stored in
# price_nei_point — the PNG saved below lacked it. The theme() call now
# sits inside the assignment.
(price_nei_point = ggplot(price_nei, aes(x = reorder(neighbourhood, -mean), y = mean))+
  geom_point()+
  theme_few()+
  labs(x = "",
       y = "price")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 4)))

# Top 10 most expensive neighbourhoods with +/-0.5 sd error bars.
# FALSE replaces the reassignable shorthand F in guides().
(price_nei_bar = ggplot(price_nei[1:10,], aes(x = reorder(neighbourhood, -mean), y = mean, fill = neighbourhood))+
  geom_col(color = "black", size = 0.3)+
  geom_errorbar(aes(ymin = mean-0.5*sd, ymax = mean+0.5*sd), width = 0.5)+
  theme_few()+
  scale_fill_tableau()+
  guides(fill = FALSE)+
  labs(x = "",
       y = "price")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))

ggsave(paste0(VISUALIZATIONS, "/price_neighbourhoood_all.png"), price_nei_point)
## Saving 7 x 5 in image
ggsave(paste0(VISUALIZATIONS, "/price_neighbourhoood_top10.png"), price_nei_bar)
## Saving 7 x 5 in image

Price vs. Room type

# Mean price and spread per room type, most expensive first.
(price_type = d1 %>%
  group_by(room_type) %>%
  summarise(mean = mean(price),
            sd = sd(price)) %>%
  arrange(desc(mean)) %>%
  sdf_collect())
## # A tibble: 3 x 3
##   room_type        mean    sd
##   <chr>           <dbl> <dbl>
## 1 Entire home/apt 213.   285.
## 2 Private room     90.3  157.
## 3 Shared room      71.2  103.
# Bar chart with +/-0.5 sd error bars.
# FALSE replaces the reassignable shorthand F in guides().
(price_type_bar = ggplot(price_type, aes(x = reorder(room_type, -mean), y = mean, fill = room_type))+
  geom_col(color = "black", size = 0.3)+
  geom_errorbar(aes(ymin = mean-0.5*sd, ymax = mean+0.5*sd), width = 0.5)+
  theme_few()+
  scale_fill_tableau()+
  guides(fill = FALSE)+
  labs(x = "",
       y = "price")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1)))

Price vs Borough/Room type

Let’s look whether there is interaction between borough and room type that explains the price variations.

# Mean price per borough x room-type combination. The two columns are
# pasted together with "-" for the Spark group_by, then split back apart
# locally. NOTE(review): this assumes neither value contains "-" — true
# for the boroughs and room types shown in the outputs above, but verify
# if the data changes.
borough_type = d1 %>%
  mutate(borough_type = paste(neighbourhood_group, room_type, sep = "-")) %>%
  group_by(borough_type) %>%
  summarise(mean = mean(price),
            sd = sd(price)) %>%
  sdf_collect() %>%
  tidyr::separate(borough_type, c("borough", "room_type"), sep = "-")

# Dodged bars: room types side by side within each borough.
(borough_type_bar = ggplot(borough_type, aes(fill=room_type, y=mean, x=borough))+
  geom_col(position="dodge", color = "black", size = 0.3)+
  geom_errorbar(aes(ymin =mean-0.5*sd, ymax = mean+0.5*sd), position = position_dodge(0.9), width = 0.5)+
  theme_few()+
  scale_fill_tableau(name = "Room type")+
  labs(x = "",
       y = "price"))

Borough/type vs price layout

# Combine the three price plots into one figure (patchwork syntax).
(bor_type_layout = (price_bor_bar + price_type_bar) / borough_type_bar)

ggsave(paste0(VISUALIZATIONS, "/borough_type.png"), bor_type_layout, width = 7, height = 6)

Maps

First, we create a background map of New York City + some detailed maps of boroughs

# SECURITY FIX: the Google Maps API key was hard-coded here (and leaked
# into the rendered output below). Read it from the environment instead;
# set GOOGLE_MAPS_API_KEY before knitting. Rotate the exposed key.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))


#New York: center the base map on the mean listing coordinates.
lat = d1 %>% summarise(lat = mean(latitude)) %>% sdf_collect %>% as.numeric()
lon = d1 %>% summarise(lon = mean(longitude)) %>% sdf_collect %>% as.numeric()

nyc = get_googlemap(center = c(lon = lon, lat = lat),
                     zoom = 11, scale = 2,
                     maptype = "roadmap",
                     color = "color")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.728528,-73.953523&zoom=11&size=640x640&scale=2&maptype=roadmap&key=<redacted>

Price map

# Listings under $500 for the map — the extreme prices would wash out
# the color scale.
price_data = d1 %>% 
  select(latitude, longitude, price, neighbourhood_group) %>%
  filter(price <500)

# Scatter the listings over the Google base map, colored by price.
# NOTE(review): the 50 removed rows reported below are presumably points
# outside the map's bounding box — verify.
(price_map = ggmap(nyc)+
  geom_point(data = price_data,aes(x = longitude, y = latitude, color = price), alpha = 0.8)+
  theme_map()+
  scale_color_gradient2_tableau())
## Warning: Removed 50 rows containing missing values (geom_point).

ggsave(paste0(VISUALIZATIONS, "/price_map.png"), price_map)
## Saving 7 x 5 in image
## Warning: Removed 50 rows containing missing values (geom_point).

Network of hosts

# Listings belonging to hosts with 10+ listings (topID), collected
# locally for mapping.
multi_hosts = d1 %>% filter(ID %in% topID) %>% sdf_collect()

# Center the zoomed-in base map on the mean coordinates of those listings.
lat = multi_hosts %>% summarise(lat = mean(latitude)) %>% as.numeric()
lon = multi_hosts %>% summarise(lon = mean(longitude)) %>% as.numeric()

nyc = get_googlemap(center = c(lon = lon, lat = lat),
                     zoom = 12, scale = 2,
                     maptype = "roadmap",
                     color = "color")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.739068,-73.968391&zoom=12&size=640x640&scale=2&maptype=roadmap&key=<redacted>
# Connect each multi-listing host's properties with lines, one color per
# host. FALSE replaces the reassignable shorthand F in guides().
ggmap(nyc) +
  geom_line(data = multi_hosts,aes(x = longitude, y = latitude, color = factor(ID)))+
  geom_point(data = multi_hosts,aes(x = longitude, y = latitude, color = factor(ID)),size=0.3)+
  guides(color=FALSE)+
  theme_void()+
  scale_color_manual(values = rainbow(length(unique(multi_hosts$ID))))
## Warning: Removed 119 rows containing missing values (geom_path).
## Warning: Removed 134 rows containing missing values (geom_point).

# One color per unique host ID. The original hard-coded 36079, which
# silently breaks (too few colors for scale_color_manual) if the number
# of distinct hosts changes after the filtering above; derive the count
# from the data instead.
cols = rep("#002a75", d1 %>% distinct(ID) %>% sdf_nrow())
# Dark "network" art plot of all listings, lines grouped per host.
# FALSE replaces the reassignable shorthand F in guides().
(network = ggplot(d1,aes(x = longitude, y = latitude, color=factor(ID)))+
  geom_line(aes(alpha=price))+
  geom_point(aes(alpha=price), size=0.01)+
  guides(color=FALSE, alpha=FALSE)+
  theme_void()+
  scale_color_manual(values = cols)+
  theme(plot.background = element_rect(fill="black", color="black")))

# Save at golden-ratio aspect for a poster-like image.
golden_ratio = (1+sqrt(5))/2
n = 7
ggsave(paste0(VISUALIZATIONS, "/network_full.png"), network, dpi=600, height = n, width = n*golden_ratio)